5 Diagnostics and Analysis

5.1 Investigate high variance wells

nrow.source = function(df, facilityname, sourcename) {
  stopifnot(length(sourcename)==1)
  return(nrow(df %>% filter(well==facilityname, source==sourcename)))
}
well_summaries = outframe %>%
  filter(facility %in% well_names, variable=="mf_pred") %>%
  group_by(facility) %>%
  summarise(mean = mean(value),
            sd = sd(value),
            n_test = nrow.source(regression_df, unique(facility),"Well Tests"),
            n_pi = nrow.source(regression_df, unique(facility), "PI Database"),
            use.test = ifelse(n_test>0, "Test data", "No test data"),
            use.pi = ifelse(n_pi>0, "PI data", "No PI data")) %>%
  arrange(desc(sd))
well_summaries$production.curve = ifelse(well_summaries$facility %in% liq_wells, "Production curve", "Time series")

fp_summaries = list(fp14 = well_summaries %>% filter(facility %in% flows_to(censor('fp14', 'fp'))),
                    fp15 = well_summaries %>% filter(facility %in% flows_to(censor('fp15', 'fp'))),
                    fp16 = well_summaries %>% filter(facility %in% flows_to(censor('fp16', 'fp'))))
for (fp in names(fp_summaries)) {
  print(xtable(fp_summaries[[fp]] %>% select(-c(use.test, use.pi, production.curve)),
               type = "latex",
               caption=paste("Data methods feeding flash plant", censor(fp, 'fp')),
               label=paste0("tab:well_summaries_", fp)),
      table.placement = "H",
      file = paste0("../_media/summaries_", fp, ".tex"))
}

n_summaries = well_summaries %>%
  group_by(use.pi, use.test) %>%
  count()

sourceplot = ggplot(well_summaries, aes(x=1, y=log(sd))) +
  geom_boxplot(fill='steelblue') +
  geom_label(data=n_summaries, aes(x=-Inf, y=-Inf, hjust=0, vjust=0, label=paste0("n=", n), family="Times New Roman"), label.size=0, fill='white') +
  facet_grid(.~ use.pi + use.test) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  labs(title="Differences in Production Error by Data Source", x="Production curve data source", y="log(standard deviation)")# +
  # ggsave('../_media/error_source.png', width=24.7*0.5, height=6, units='cm')
ggplotly(sourceplot)
sourcetab = well_summaries %>% select(facility, mean, sd, n_test, n_pi)
# print(xtable(sourcetab %>% head(), type = "latex",
#              caption="Upon inspection of the wells with the most variance, there is no immediate cause for high variance. This requires further investigation.",
#              label="tab:well_summaries"),
#       table.placement = "h",
#       file = "../_media/well_summaries.tex")
kable(cbind(sourcetab)) %>%
  kable_styling() %>%
  scroll_box(height = "400px")
facility mean sd n_test n_pi
wk86 81.5344741 61.3421227 4 0
wk28 88.7191863 49.1865621 26 0
wk242 321.9122120 46.6096937 28 0
wk65 26.7570149 42.1490342 2 0
wk27 163.8370066 37.6843482 36 0
wk81 60.1607344 27.4466959 24 0
wk116 76.0484841 26.6558492 10 0
wk26b 112.4015979 25.6578444 13 0
wk76 124.3262822 23.0842652 31 0
wk123 184.1871324 22.9574870 31 0
wk67 92.9119044 22.1894908 26 0
wk59 86.9360822 22.1636942 31 0
wk96 27.8387902 20.8344581 10 0
wk72 151.6238128 20.6750130 26 0
wk71 127.4461328 19.8326786 32 0
wk256 218.9385203 18.8662810 32 30
wk268 90.9012237 17.4071984 27 30
wk266 326.5706599 17.0555332 29 30
wk264 393.5393997 16.4000437 34 30
wk245 418.2219228 15.5064443 41 30
wk26a 119.5009793 14.5287864 16 0
wk265 335.9188636 14.4117511 34 30
wk267 308.6025505 14.0059520 31 30
wk255 377.3065824 13.9818617 41 30
wk83 133.4618281 13.4622589 26 0
wk74 156.6504648 12.7175994 26 0
wk269 133.2196999 11.0219560 25 30
wk235 206.0957866 10.3933763 20 0
wk70 151.4158667 10.0002375 45 0
wk55 65.1317321 9.9032863 47 0
wk262 633.1256048 9.5574724 36 30
wk244 196.7817051 9.4283939 37 30
wk229 201.3411656 9.1752904 92 0
wk124 252.5206841 8.9259854 31 0
wk222 105.9781842 8.4414258 44 0
wk243 375.1448829 8.2169925 52 30
wk247 232.9289526 6.6020166 48 30
wk271 86.5929859 6.5142536 6 30
wk207 150.3506614 6.2642318 18 0
wk250 25.9639557 5.9004806 0 30
wk239 132.8546240 5.8128930 18 0
wk46 43.9515350 5.2028818 18 0
wk260 295.9918124 5.0412982 33 30
wk261 250.3107907 4.6730155 37 30
wk253 269.9234974 4.5727659 32 30
wk270 460.2166869 4.0465622 5 30
wk258 198.6058922 3.2505974 38 30
wk263 221.6051927 3.2470226 34 30
wk259 309.0674371 3.1877495 37 30
wk272 208.3516971 3.0291599 7 30
wk254 226.6098609 2.3835320 39 30
wk605 34.4044858 1.3023662 0 0
wk606 22.5021979 1.0342556 0 0
wk237 14.3441589 0.9160045 0 30
wk233 13.6224547 0.5723686 0 30
wk241 45.4437549 0.4758516 0 30
wk249 1.1520835 0.3806202 0 0
wk238 60.2461360 0.3380969 0 30
wk610 6.6595603 0.2506634 0 0
wk607 1.4406459 0.2482484 0 0
wk234 30.9725253 0.2376167 0 30
wk25 6.6234142 0.1906251 0 0
wk251 22.6509078 0.1783918 0 30
wk240 23.5145992 0.1445570 0 30
wk228 14.9865455 0.1435981 0 30
wk236 26.3738049 0.0825879 0 0
wk216 8.1298059 0.0760767 0 0
wk232 0.0847269 0.0734232 0 30
wk252 58.8881359 0.0566079 0 30
wk118 9.5054893 0.0345031 0 0
wk604 0.7825628 0.0285853 0 0
wk92 0.0023263 0.0023603 0 0
wk101 0.0017559 0.0013574 0 0
wk66 0.0000000 0.0000000 0 0
wk88 0.0000000 0.0000000 0 0

5.2 Regression Fits

prod = as.data.frame(outmatrix) %>%
  select(contains('prod')) %>%
  gather(key=facility, value=value) %>%
  mutate(which=parse_number(facility)) %>%
  mutate(whp=data$whp_prod[which],
         well = names(ids)[data$well_id_prod[which]]) %>%
  rename(mf=value) %>%
  group_by(well, whp) %>%
  summarise(lower=quantile(mf, 0.025),
            upper=quantile(mf, 0.975),
            mean=mean(mf))

plotdata = regression_df %>%
  filter(well_id %in% ids[production_curve_wells]) %>%
  mutate(datetime = factor(as.Date(date))) %>%
  mutate(source = factor(source, levels=c("Well Tests", "PI Database")))

# regression plot
regplot = ggplot(prod, aes(x=whp)) +
  geom_line(aes(y=mean, color=well)) +
  geom_ribbon(aes(ymin=lower, ymax=upper, fill=well), alpha=0.25) +
  geom_point(data=plotdata, aes(y=mf, color=well, size=date, shape=source), alpha=0.5) +
  labs(title="Linear Regression on Test and PI Data", x="Well-head pressure (bar)", y="Mass flow (T/h)", color="Well", shape="Data source", size="Date", fill="Well") +
  coord_cartesian(xlim=c(min(plotdata$whp)*0.9,max(plotdata$whp)*1.1), ylim=c(0,max(plotdata$mf)*1.1))# +
  # ggsave('../_media/production_curve.png', width=24.7*0.48, height=24.7*0.48, units='cm')
ggplotly(regplot)

5.3 Time Series Plots

tsplotwells = ar_wells
ts_fit = as.data.frame(outmatrix) %>%
  select(contains('mf_ts')) %>%
  gather() %>%
  mutate(index = parse_number(key)) %>% select(-key) %>%
  group_by(index) %>%
  summarise(lower=quantile(value, 0.025),
            upper=quantile(value, 0.975),
            mean=mean(value)) %>%
  cbind(ts) %>%
  mutate(well = factor(names(ids[well_id_ts])),
         date_numeric = date_numeric_ts)

# actual observations
tsplotdata = dry_df %>%
  filter(well_id %in% ids[tsplotwells]) %>%
  mutate(datetime = factor(as.Date(date)),
         facility = well)

# experimental AR1 time series
ar_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ar")) %>%
  gather() %>%
  mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
         facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
  select(facility, date_numeric, value) %>%
  group_by(facility, date_numeric) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975)) %>%
  filter(facility %in% tsplotwells)

# experimental EMA time series
ewma_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ema")) %>%
  gather() %>%
  mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
         facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
  select(facility, date_numeric, value) %>%
  group_by(facility, date_numeric) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975)) %>%
  filter(facility %in% tsplotwells)

# find plot limits
tsmax = max(c(ts_fit$upper, ar_fit$upper, ewma_fit$upper))

lintsplot = ggplot(ts_fit, aes(x=date_numeric, color=well, fill=well)) +
  geom_line(aes(y=mean), linetype="dashed") +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.25) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  coord_cartesian(ylim=c(0, 60)) +
  labs(title=paste("Linear Time Series Regression for Selected Wells in PI"), x="Days since baseline (2000)", linetype="")# +
  # ggsave('../_media/dry_time_series.png', width=24.7, height=8, units='cm')

arplot = ggplot(ar_fit %>% filter(facility %in% tsplotwells), aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  labs(title="AR(1) Experiment", x="Days since first date", y="Mass flow (T/h)") +
  ggsave('../_media/ar_experiment.png', width=24.7, height=8, units='cm')

ewmaplot = ggplot(ewma_fit, aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  labs(title="EWMA Experiment", x="Days since first date")# +
  # ggsave('../_media/ewma_experiment.png', width=24.7, height=8, units='cm')

ggplotly(lintsplot)
ggplotly(arplot)
ggplotly(ewmaplot)
tsgrob = grid_arrange_shared_legend(lintsplot, arplot, ewmaplot, nrow=3, ncol=1, position = "bottom")

tsgrob
## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name              grob
## 1 1 (1-1,1-1) arrange   gtable[arrange]
## 2 2 (2-2,1-1) arrange gtable[guide-box]
# ggsave('../_media/ts_experiment.png', tsgrob, width=24.7, height=24, units='cm')

5.4 Goodness of fit (OLS regression)

liq_fit = as.data.frame(outmatrix) %>%
  select(contains('mf_fit')) %>%
  gather(key='index', value='fitted') %>%
  mutate(index=as.integer(parse_number(index))) %>%
  group_by(index) %>%
  summarise(lower=quantile(fitted, 0.025),
            upper=quantile(fitted, 0.975),
            Fitted=mean(fitted),
            std=sd(fitted)) %>%
  cbind(regression_df) %>%
  mutate(`Standardised residual` = (Fitted-mf)/std,
         Well = factor(names(ids[well_id])),
         Observed = mf) %>%
  gather(key="key", value="value", `Standardised residual`, Observed) %>%
  select(Well, key, Fitted, value, source)

diagplot = ggplot(liq_fit, aes(x=Fitted, y=value)) +
  geom_point(aes(color=Well, shape=Well)) + scale_shape_manual(values = rep_len(1:25, length(unique(liq_fit$Well)))) +
  geom_smooth(color='black') +
  facet_wrap(~key, scales="free") +
  geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
  # coord_cartesian(ylim=c(-4, 4)) +
  labs(title="Diagnostic Plots", x="Fitted mass flow (T/h)", y="") +
  theme(legend.position = "bottom") +
  guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T))# +
  # ggsave('../_media/diagnostics.png', width=24.7, height=12, units='cm')
ggplotly(diagplot)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
selectwells = liq_fit %>% group_by(Well, key) %>% summarise(fittedsd = sd(Fitted)) %>%
  arrange(desc(fittedsd)) %>% head(56*2) %>% pull(Well)

observedplot = ggplot(liq_fit %>% filter(key=="Observed", Well %in% selectwells), aes(x=Fitted, y=value)) +
  geom_point(aes(color=source), alpha=0.5) +
  geom_smooth(color=NA, alpha=0.5) +
  facet_wrap(~Well, scales="free") +
  # geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b)) +
  labs(title="Linear Regression Fit Plots Per Well", x="Fitted mass flow (T/h)", y="Observed mass flow (T/h)", color="Data source") +
  theme(legend.position = "bottom")# +
  # guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
  # ggsave('../_media/observed.png', width=24.7, height=24.7, units='cm')

stdresplot = ggplot(liq_fit %>% filter(key=="Standardised residual", Well %in% selectwells), aes(x=Fitted, y=value)) +
  geom_point(aes(color=source), alpha=0.5) +
  geom_smooth(color=NA, alpha=0.5) +
  facet_wrap(~Well, scales="free_x") +
  geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  # geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
  labs(title="Linear Regression Residual Plots Per Well", x="Fitted mass flow (T/h)", y="Standardised residual", color="Data source") +
  coord_cartesian(ylim=c(-5, 5)) + theme(legend.position="bottom")# +
  # guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
  # ggsave('../_media/stdres.png', width=24.7, height=24.7, units='cm')

ggplotly(observedplot)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplotly(stdresplot)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# stdres_min = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% min()
# stdres_max = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% max()
# ggplot(liq_fit %>% filter(key=="Standardised residual"), aes(x=value)) +
#   geom_density(fill="red", alpha=0.5, color=NA) +
#   geom_line(data=data.frame(x=seq(stdres_min, stdres_max, length.out=100)), aes(x=x, y=dnorm(x)))

5.5 Limits and Constraint Violations

sf.df <- outframe %>% 
  filter(str_detect(variable, "total_sf") & value > 0) %>% 
  droplevels()
limits = fp_constants %>%
  mutate(facility = names(ids)[fp_id]) %>%
  select(facility, limit) %>% 
  drop_na()

p.limits = sf.df %>%
  left_join(limits, by=c("facility")) %>%
  mutate(greater = value > limit) %>%
  group_by(facility) %>%
  summarise(p.greater = mean(greater)) %>%
  drop_na()

limitplot = ggplot(sf.df %>% filter(facility %ni% incomplete.fps), aes(x=value, fill=facility)) +
  facet_wrap(~facility, scales = "free_y", ncol=2) +
  geom_density(alpha=0.5, color=NA) +
  geom_vline(data=limits, aes(xintercept=limit), color="red") +
  geom_label(data=p.limits %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>lim)=", p.greater), family="Times New Roman"), color="black", label.size=0, fill='white') +
  theme(legend.position="none",
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(title="Posterior Flash Plant Mass Flows", x="Steam flow (T/h)", y="Density", fill="Flash plant", color="Steam flow limit")# +
  # ggsave('../_media/constraints.png', width=24.7, height=10, units='cm')
ggplotly(limitplot)

5.6 Flow Comparison

flow.df <- outframe %>% 
  filter(facility %in% fp_names) %>%
  filter(str_detect(variable, "mf_pred|ip_sf|lp_sf|wf") & value > 0) %>%
  mutate(variable=ifelse(variable=="mf_pred", "mf", variable),
         variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf")))

comparison = fp_constants %>% select("fp", contains("verification")) %>%
  rename(facility=fp) %>%
  gather(key="variable", value="value", -facility) %>%
  mutate(variable = gsub("^verification_", "", variable),
         variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
  drop_na()

ps = flow.df %>%
  left_join(comparison, by=c("facility", "variable")) %>%
  mutate(greater = value.x > value.y) %>%
  group_by(facility, variable) %>%
  summarise(p.greater = mean(greater)) %>%
  mutate(variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
  drop_na()

verificationplot = ggplot(flow.df %>% filter(facility %ni% incomplete.fps), aes(x=value)) +
  geom_density(aes(y=..scaled.., fill=variable, color=variable), alpha=0.5, show.legend=F) +
  geom_vline(data=comparison %>% filter(facility %ni% incomplete.fps), aes(xintercept=value)) +
  geom_label(data=ps %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>x)=", p.greater), family="Times New Roman"), label.size=0) +
  facet_grid(facility~variable, scales="free", space="free_y") +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(x="Value", y="Scaled density", title="Comparison Between Predicted FP Flows and Sample Data")# +
  # ggsave('../_media/verification.png', width=24.7, height=20, units='cm')
ggplotly(verificationplot)